In [1]:
import chardet
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
In [2]:
# Specify the path to the CSV file.
# Use a raw string: in "C:\datasets\Auto Dataset.csv" the sequences \d and \A
# are invalid escape sequences (a SyntaxWarning on modern Python).
csv_file = r"C:\datasets\Auto Dataset.csv"
# Detect the encoding from a sample of the file — chardet does not need the
# whole file, and sampling keeps this fast on large CSVs
with open(csv_file, 'rb') as f:
    result = chardet.detect(f.read(100_000))
# Print the detected encoding
print(result['encoding'])
Windows-1252
In [3]:
# Specify the path to the CSV file (raw string avoids the invalid \d and \A
# escape sequences of the original plain string)
csv_file = r"C:\datasets\Auto Dataset.csv"
# Read the CSV with the encoding detected in the previous cell instead of a
# hardcoded 'Windows-1252' literal, so the two cells cannot drift out of sync
df = pd.read_csv(csv_file, encoding=result['encoding'])
print(df.head())
# Check the column names and data types
print(df.columns)
print(df.dtypes)
       dateCrawled                                               name  seller  \
0  3/26/2016 17:47                   Peugeot_807_160_NAVTECH_ON_BOARD  privat   
1   4/4/2016 13:38         BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik  privat   
2  3/26/2016 18:57                         Volkswagen_Golf_1.6_United  privat   
3  3/12/2016 16:58  Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan...  privat   
4   4/1/2016 14:38  Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg...  privat   

  offerType    price   abtest vehicleType  yearOfRegistration    gearbox  \
0   Angebot  $5,000   control         bus                2004    manuell   
1   Angebot  $8,500   control   limousine                1997  automatik   
2   Angebot  $8,990      test   limousine                2009    manuell   
3   Angebot  $4,350   control  kleinwagen                2007  automatik   
4   Angebot  $1,350      test       kombi                2003    manuell   

   powerPS   model   odometer  monthOfRegistration fuelType       brand  \
0      158  andere  150,000km                    3      lpg     peugeot   
1      286     7er  150,000km                    6   benzin         bmw   
2      102    golf   70,000km                    7   benzin  volkswagen   
3       71  fortwo   70,000km                    6   benzin       smart   
4        0   focus  150,000km                    7   benzin        ford   

  notRepairedDamage     dateCreated  nrOfPictures  postalCode        lastSeen  
0              nein  3/26/2016 0:00             0       79588   4/6/2016 6:45  
1              nein   4/4/2016 0:00             0       71034  4/6/2016 14:45  
2              nein  3/26/2016 0:00             0       35394  4/6/2016 20:15  
3              nein  3/12/2016 0:00             0       33729  3/15/2016 3:16  
4              nein   4/1/2016 0:00             0       39218  4/1/2016 14:38  
Index(['dateCrawled', 'name', 'seller', 'offerType', 'price', 'abtest',
       'vehicleType', 'yearOfRegistration', 'gearbox', 'powerPS', 'model',
       'odometer', 'monthOfRegistration', 'fuelType', 'brand',
       'notRepairedDamage', 'dateCreated', 'nrOfPictures', 'postalCode',
       'lastSeen'],
      dtype='object')
dateCrawled            object
name                   object
seller                 object
offerType              object
price                  object
abtest                 object
vehicleType            object
yearOfRegistration      int64
gearbox                object
powerPS                 int64
model                  object
odometer               object
monthOfRegistration     int64
fuelType               object
brand                  object
notRepairedDamage      object
dateCreated            object
nrOfPictures            int64
postalCode              int64
lastSeen               object
dtype: object
In [4]:
# Convert the 'price' column (e.g. "$5,000") to numeric by stripping '$' and ','.
# The regex is a raw string: '[\$,]' in a plain string is an invalid escape
# sequence on modern Python.
df['price'] = df['price'].replace(r'[\$,]', '', regex=True).astype(float)

# Drop identifier/date-like columns that carry no predictive signal here
drop_columns = ['dateCrawled', 'name', 'seller', 'offerType', 'abtest', 'model', 'lastSeen']
# Prefer reassignment over inplace=True (no performance benefit, hides state changes)
df = df.drop(columns=drop_columns)

# One-hot encode the remaining categorical columns; drop_first avoids
# perfectly collinear dummy columns
df = pd.get_dummies(df, drop_first=True)
In [5]:
# Split the dataset into features (X) and target (y)
X = df.drop(columns=['price'])
y = df['price']
# Split the data into training and testing sets (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create and train the Random Forest Regressor
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(X_train, y_train)
# For a regressor, .score() returns R² (coefficient of determination), not
# classification accuracy — label the printed metric accordingly
print("R^2 on training dataset", model.score(X_train, y_train))
print("R^2 on testing dataset", model.score(X_test, y_test))
Accuracy on training dataset 0.8012865492695206
Accuracy on testing dataset -14.141918456127756
In [6]:
# Heavily regularized forest (depth-1 stumps) to probe the severe overfitting
# seen above; expect both train and test scores near zero
model = RandomForestRegressor(n_estimators=100, random_state=0, max_depth=1)
model.fit(X_train, y_train)
# .score() on a regressor is R², not accuracy — label it correctly
print("R^2 on training dataset", model.score(X_train, y_train))
print("R^2 on testing dataset", model.score(X_test, y_test))
Accuracy on training dataset 0.001008273339773269
Accuracy on testing dataset -0.00721577271035545
In [12]:
# Score the current model on the held-out test split
from sklearn.metrics import mean_squared_error, r2_score

test_predictions = model.predict(X_test)
print("Mean Squared Error:", mean_squared_error(y_test, test_predictions))
print("R2 Score:", r2_score(y_test, test_predictions))
Mean Squared Error: 14255937322.490925
R2 Score: -0.00721577271035545
In [7]:
# Per-feature importances from the fitted model (same order as X's columns)
importances = model.feature_importances_
print("Feature importance: \n", importances)
Feature importance: 
 [0.24 0.22 0.   0.   0.03 0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.1  0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.01
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.38 0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.02 0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.  ]
In [9]:
# Horizontal bar chart of feature importances.
# The original figsize=(150, 100) is in inches and produced an unusably huge
# figure; scale the height to the number of features instead.
# (matplotlib and numpy are already imported at the top of the notebook.)
n_features = X_train.shape[1]
fig, ax = plt.subplots(figsize=(10, n_features * 0.25))
# Plot the feature importances
ax.barh(range(n_features), model.feature_importances_, align="center")
# Label each bar with its feature name
ax.set_yticks(np.arange(n_features))
ax.set_yticklabels(X.columns)
# Label the axes so the figure stands alone
ax.set_xlabel('Feature Importance')
ax.set_ylabel('Feature')
plt.tight_layout()
plt.show()
In [13]:
# Split the dataset into features (X) and target (y)
X = df.drop(columns=['price'])
y = df['price']
# Split with the same seed as before so this experiment sees the identical split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create and train the Gradient Boosting Regressor
model = GradientBoostingRegressor(n_estimators=100, random_state=0)
model.fit(X_train, y_train)
# .score() on a regressor is R² (coefficient of determination), not accuracy
print("R^2 on training dataset", model.score(X_train, y_train))
print("R^2 on testing dataset", model.score(X_test, y_test))
Accuracy on training dataset 0.8424825900005556
Accuracy on testing dataset -0.9874505787618353
In [19]:
# Evaluate the gradient boosting model on the held-out split
from sklearn.metrics import mean_squared_error, r2_score

predictions = model.predict(X_test)
print("Mean Squared Error:", mean_squared_error(y_test, predictions))
print("R2 Score:", r2_score(y_test, predictions))
Mean Squared Error: 28129991259.107037
R2 Score: -0.9874505787618353
In [17]:
# Importance of each feature under the gradient boosting model
gb_importances = model.feature_importances_
print("Feature importance: \n", gb_importances)
Feature importance: 
 [6.77623690e-02 9.89112861e-02 2.41586681e-03 0.00000000e+00
 5.34428542e-02 0.00000000e+00 0.00000000e+00 3.01329825e-06
 0.00000000e+00 2.27272565e-05 7.21735076e-03 0.00000000e+00
 9.40271086e-05 0.00000000e+00 0.00000000e+00 7.18401207e-05
 0.00000000e+00 0.00000000e+00 2.94354509e-02 0.00000000e+00
 1.55252966e-03 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 2.69450391e-03 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 6.46169790e-03
 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.32992764e-03
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 1.54330198e-04 0.00000000e+00
 2.00704087e-04 0.00000000e+00 4.31511406e-03 0.00000000e+00
 3.61924091e-05 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 8.42617142e-06
 1.75398241e-05 0.00000000e+00 0.00000000e+00 7.17988432e-11
 5.60342886e-04 4.56721463e-03 1.62381015e-02 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 1.84629629e-04 0.00000000e+00 0.00000000e+00 4.38499090e-10
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 6.89695123e-01
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 9.67802968e-04 0.00000000e+00
 0.00000000e+00 7.86742666e-06 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 8.09185939e-03 3.53930637e-03
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00]
In [18]:
# Bar chart of gradient boosting feature importances.
# The original figsize=(150, 100) inches was unusably large; size the height
# by the number of features so every label fits.
# (matplotlib and numpy are already imported at the top of the notebook.)
n_features = X_train.shape[1]
fig, ax = plt.subplots(figsize=(10, n_features * 0.25))
# Plot the feature importances
ax.barh(range(n_features), model.feature_importances_, align="center")
# Label each bar with its feature name
ax.set_yticks(np.arange(n_features))
ax.set_yticklabels(X.columns)
# Label the axes so the figure stands alone
ax.set_xlabel('Feature Importance')
ax.set_ylabel('Feature')
plt.tight_layout()
plt.show()
In [21]:
# Create and train the Decision Tree Regressor
# (the original comment said "Classifier", but this is a regression model)
model = DecisionTreeRegressor(random_state=0)
model.fit(X_train, y_train)
# .score() on a regressor is R², not accuracy; an unconstrained tree is
# expected to memorize the training set (train R² near 1)
print("R^2 on training dataset", model.score(X_train, y_train))
print("R^2 on testing dataset", model.score(X_test, y_test))
Accuracy on training dataset 0.9999999995329204
Accuracy on testing dataset -1.0779170076297566
In [22]:
# Create and train a depth-1 Decision Tree Regressor (a single stump)
# to contrast with the fully grown, overfit tree above
model = DecisionTreeRegressor(random_state=0, max_depth=1)
model.fit(X_train, y_train)
# .score() on a regressor is R², not accuracy
print("R^2 on training dataset", model.score(X_train, y_train))
print("R^2 on testing dataset", model.score(X_test, y_test))
Accuracy on training dataset 0.00065387246845372
Accuracy on testing dataset -0.004053662111415779
In [23]:
# Importance of each feature under the decision tree (a depth-1 stump uses one feature)
dt_importances = model.feature_importances_
print("Feature importance: \n", dt_importances)
Feature importance: 
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0.]
In [28]:
from sklearn import tree

# Visualize the fitted regression tree. `class_names` is dropped: it applies
# to classifiers only — a regressor's leaves display predicted values, not classes.
fig = plt.figure(figsize=(25, 15))
_ = tree.plot_tree(model, filled=True, rounded=True,
                   feature_names=X.columns)
In [29]:
# Bar chart of decision tree feature importances.
# The original figsize=(150, 100) inches was unusably large; size the height
# by the number of features instead.
# (matplotlib and numpy are already imported at the top of the notebook.)
n_features = X_train.shape[1]
fig, ax = plt.subplots(figsize=(10, n_features * 0.25))
# Plot the feature importances
ax.barh(range(n_features), model.feature_importances_, align="center")
# Label each bar with its feature name
ax.set_yticks(np.arange(n_features))
ax.set_yticklabels(X.columns)
# Label the axes so the figure stands alone
ax.set_xlabel('Feature Importance')
ax.set_ylabel('Feature')
plt.tight_layout()
plt.show()
In [ ]: